Sweep enrichments

We look for enrichments in all the 90%-regions

In [1]:
import re, os, sys, pickle, pickle
from pathlib import Path
import numpy
import scipy
import pandas
from pandas import DataFrame, Series
from sklearn.decomposition import PCA
from collections import Counter, defaultdict
import random, bisect

random.seed(7)

import pyfaidx

# my own libaries
from GenomicWindows import window
import GenomicIntervals

numpy.random.seed(7)

Plotting setup:

In [2]:
%matplotlib inline

# Make inline plots vector graphics instead of raster graphics
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('pdf', 'svg')

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D 
from mpl_toolkits.basemap import Basemap
#matplotlib.rcParams['figure.figsize'] = (20.0, 10.0)

import mpld3

import seaborn as sns
sns.set() # sets seaborn default "prettyness:
sns.set_style("whitegrid")
sns.set_context("paper")

# lowess for plotting
from statsmodels.nonparametric.smoothers_lowess import lowess

set1 = {'red': '#e41a1c', 'blue': '#377eb8', 'green': '#4daf4a',
        'purple': '#984ea3', 'orange': '#ff7f00', 
        'yellow': '#ffff33', 'brown': '#a65628'}

Ignore deprecation warnings from mainly seaborn:

In [3]:
# silence deprecation warnings (lots from seaborn)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=numpy.VisibleDeprecationWarning)

Analysis dirs:

In [4]:
root_dir = Path(os.environ['HOME'], 'simons/faststorage/people/kmt')
meta_data_dir = Path(os.environ['HOME'], 'simons/faststorage/data/metadata')
steps_dir = root_dir / 'steps'
argweaver_dir = steps_dir / 'argweaver/output'
results_dir = root_dir / 'results'
figures_dir = root_dir / 'figures'
data_dir = root_dir / 'data'
pi_dir = steps_dir / 'pi_stores'
dist_dir = steps_dir / 'dist_stores'
#pi_dir = root_dir / 'old_pi_stores'
male_x_haploid_dir = steps_dir / 'male_x_haploids'

Local code in the scripts dir on the cluster:

In [5]:
scripts_dir = root_dir / 'scripts'
if str(scripts_dir) not in sys.path:
    sys.path.append(str(scripts_dir))

import simons_meta_data
import hg19_chrom_sizes

from toggle_code_and_errors import toggle_code_html, toggle_errors_html

Import variables global to the entire analysis:

In [6]:
import analysis_globals

Convenience functions

In [7]:
def silent_nanmean(x):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return numpy.nanmean(x)
    
def ident_scalar(s):
    x = s.unique()
    assert(len(x)) == 1, x
    return x[0]

Load meta data

In [8]:
# easy loading of meta data in a consistent manner across code
individuals, populations, regions = simons_meta_data.get_meta_data(meta_data_dir=meta_data_dir)

pop_categories = pandas.read_hdf(str(results_dir / 'population_categories.store'), 'sr')
region_categories = pandas.read_hdf(str(results_dir / 'region_categories.store'), 'sr')

region_colors = dict(zip(list(region_categories), 
                         ['#e41a1c', '#377eb8',  '#984ea3', '#4daf4a',
                          '#ff7f00', '#ffff33', '#a65628']))

chromosome_lengths = dict((k.replace('chr', ''), v) for k, v in hg19_chrom_sizes.hg19_chrom_sizes.items())

reference_genome_file = Path('/home', 'kmt', 'simons', 
                        'faststorage', 'cteam_lite_public3', 'FullyPublic', 'Href.fa')
In [9]:
def genes_overlapping_intervals(genes, peaks):    
    lst = list()
    for tup in peaks.itertuples():
        df = genes.copy().loc[(genes.start < tup.end) & (genes.end > tup.start)]
        df['peak_start'] = tup.start
        df['peak_end'] = tup.end
        lst.append(df)
    return pandas.concat(lst)
        
def genes_in_intervals(genes, peaks):
    pos = genes.start + (genes.end-genes.start)/2
    idx = numpy.equal(numpy.searchsorted(peaks.start, pos) - 1, numpy.searchsorted(peaks.end, pos, side='left'))
    return genes.copy().iloc[idx]

Sweep peak and region data

In [10]:
sweep_peaks = pandas.read_hdf(results_dir / 'sweep_peaks.hdf')
In [11]:
extended_peak_regions_subset = (pandas.read_hdf(results_dir / 'extended_peak_regions_90%.hdf')
                        )
extended_peak_regions_subset.head()
extended_peak_regions_subset['start'] = extended_peak_regions_subset.start_pos
extended_peak_regions_subset['end'] = extended_peak_regions_subset.end_pos

Biomart gene annotation on chrX

In [12]:
biomart_genes_x = pandas.read_hdf(results_dir / 'biomart_genes.hdf').loc[lambda df: df.chrom == 'X']
print(len(biomart_genes_x))
biomart_genes_x.head()
2392
Out[12]:
Gene stable ID chrom start end strand Gene type name
3215 ENSG00000102309 X 71401203 71522776 1 protein_coding PIN4
3325 ENSG00000186871 X 71424510 71458897 -1 protein_coding ERCC6L
3382 ENSG00000198034 X 71475529 71497150 -1 protein_coding RPS4X
3594 ENSG00000102100 X 48760459 48769235 -1 protein_coding SLC35A2
3815 ENSG00000102096 X 48770459 48776301 -1 protein_coding PIM2

Testis genes from human protein atlas

2237 genes show elevated expression in the testis compared to other tissues:

In [13]:
hpatlas_testis_elevated = pandas.read_table(data_dir / 'tissue_specificity_rna_testis_elevated.tsv')
testis_elevated_genes = biomart_genes_x.loc[biomart_genes_x.name.isin(hpatlas_testis_elevated.Gene)]
len(testis_elevated_genes)
Out[13]:
191

1079 enriched have at least five-fold higher mRNA levels compared to all other tissues

In [14]:
hpatlas_testis_enriched = pandas.read_table(data_dir / 'tissue_specificity_rna_testis_Tissue.tsv')
testis_enriched_genes = biomart_genes_x.loc[biomart_genes_x.name.isin(hpatlas_testis_enriched.Gene)]
len(testis_enriched_genes)
Out[14]:
136
In [15]:
overlap = genes_overlapping_intervals(testis_elevated_genes, extended_peak_regions_subset)
overlap
Out[15]:
Gene stable ID chrom start end strand Gene type name peak_start peak_end
28692 ENSG00000147081 X 49955406 49965664 -1 protein_coding AKAP4 49500000 50000000
29201 ENSG00000147082 X 49967364 50094909 1 protein_coding CCNB3 49500000 50000000
30740 ENSG00000122824 X 51075083 51080377 1 protein_coding NUDT10 50800000 51300000
30795 ENSG00000187690 X 51149767 51151687 1 protein_coding CXorf67 50800000 51300000
30809 ENSG00000196368 X 51232863 51239448 -1 protein_coding NUDT11 50800000 51300000
47112 ENSG00000196632 X 54219256 54385075 -1 protein_coding WNK3 54000000 54400000
50027 ENSG00000189299 X 55649833 55652621 1 protein_coding FOXR2 55300000 55800000
50784 ENSG00000204071 X 101395448 101397942 -1 protein_coding TCEAL6 100900000 101400000
58085 ENSG00000123496 X 114238538 114254540 -1 protein_coding IL13RA2 114000000 114300000
40148 ENSG00000123165 X 127184943 127186382 -1 protein_coding ACTRT1 126800000 127400000
48249 ENSG00000076770 X 131503345 131623996 -1 protein_coding MBNL3 131200000 131600000
44907 ENSG00000203870 X 154051623 154062937 -1 protein_coding SMIM9 153900000 154400000
45207 ENSG00000198082 X 154113317 154113833 1 protein_coding H2AFB1 153900000 154400000

Fisher's exact test show no enrichment:

In [16]:
overlap_all = genes_overlapping_intervals(biomart_genes_x, extended_peak_regions_subset)

nr_prot_overlapping = len(overlap_all.loc[lambda df: df['Gene type'] == 'protein_coding'])
nr_prot_biomart = len(biomart_genes_x.loc[lambda df: df['Gene type'] == 'protein_coding'])

table = [[len(overlap), len(testis_elevated_genes)-len(overlap)],
         [nr_prot_overlapping, nr_prot_biomart-nr_prot_overlapping]]
print(table)

#scipy.stats.fisher_exact([[20, 100],[100, 1000]], alternative='greater')
scipy.stats.fisher_exact(table, alternative='greater')
[[13, 178], [88, 742]]
Out[16]:
(0.61580694586312568, 0.96189309344982155)

All overlapping biomart genes

In [17]:
overlap_all['Gene type'].value_counts()
Out[17]:
protein_coding          88
pseudogene              72
miRNA                   21
snRNA                   12
lincRNA                 11
misc_RNA                 8
antisense                7
snoRNA                   5
sense_intronic           4
processed_transcript     2
sense_overlapping        1
rRNA                     1
Name: Gene type, dtype: int64

All overlapping biomart non-coding genes

In [18]:
with pandas.option_context('display.max_rows', None, 'display.max_columns', None):
    display(overlap_all
            #.loc[lambda df: df['Gene type'] == 'protein_coding']
            .loc[lambda df: (df['Gene type'] != 'protein_coding') & (df['Gene type'] != 'pseudogene')]
            .sort_values(['start']))
Gene stable ID chrom start end strand Gene type name peak_start peak_end
47346 ENSG00000234129 X 10865996 11129261 -1 lincRNA RP11-120D5.1 11100000 11500000
47352 ENSG00000207151 X 11134212 11134313 -1 misc_RNA Y_RNA 11100000 11500000
47356 ENSG00000263652 X 11336734 11336806 -1 miRNA MIR548AX 11100000 11500000
47660 ENSG00000221278 X 14783097 14783239 -1 miRNA AC003658.1 14700000 14900000
48163 ENSG00000206663 X 20003176 20003277 -1 misc_RNA Y_RNA 19600000 20200000
48165 ENSG00000264566 X 20035206 20035305 -1 miRNA MIR23C 19600000 20200000
48168 ENSG00000201882 X 20154184 20154253 -1 snoRNA snoU2-30 19600000 20200000
48170 ENSG00000201592 X 20154424 20154503 -1 snoRNA snoU2_19 19600000 20200000
48171 ENSG00000225037 X 20158086 20158562 1 antisense EIF1AX-AS1 19600000 20200000
48173 ENSG00000206716 X 21232755 21232861 1 snRNA RNU6-133P 21100000 21600000
48180 ENSG00000266822 X 21581190 21581281 1 miRNA AL928874.1 21100000 21600000
49075 ENSG00000206733 X 36040056 36040162 -1 snRNA RNU6-641P 36000000 36400000
49079 ENSG00000226484 X 36383741 36458375 -1 antisense RP11-87M18.2 36000000 36400000
6638 ENSG00000234390 X 49641327 49643844 -1 lincRNA USP27X-AS1 49500000 50000000
6671 ENSG00000201341 X 49709947 49710053 -1 snRNA RNU6-421P 49500000 50000000
6951 ENSG00000207758 X 49767754 49767844 1 miRNA MIR532 49500000 50000000
6958 ENSG00000207768 X 49768109 49768194 1 miRNA MIR188 49500000 50000000
6969 ENSG00000265152 X 49771262 49771344 1 miRNA AF222686.1 49500000 50000000
6979 ENSG00000207785 X 49773039 49773122 1 miRNA MIR500A 49500000 50000000
6992 ENSG00000208015 X 49773572 49773636 1 miRNA MIR362 49500000 50000000
7006 ENSG00000211538 X 49774330 49774413 1 miRNA MIR501 49500000 50000000
7014 ENSG00000239057 X 49775279 49775357 1 miRNA MIR500B 49500000 50000000
7023 ENSG00000207970 X 49777849 49777945 1 miRNA MIR660 49500000 50000000
7031 ENSG00000272080 X 49779206 49779291 1 miRNA MIR502 49500000 50000000
7056 ENSG00000202495 X 49935841 49935941 -1 misc_RNA Y_RNA 49500000 50000000
10589 ENSG00000230317 X 50838690 50914232 -1 lincRNA RP11-104D21.3 50800000 51300000
10684 ENSG00000229151 X 51099796 51139314 -1 antisense RP11-348F1.3 50800000 51300000
10696 ENSG00000226530 X 51139363 51208513 1 lincRNA RP11-348F1.2 50800000 51300000
10714 ENSG00000234766 X 51244942 51253857 -1 lincRNA RP11-56H2.2 50800000 51300000
19921 ENSG00000252175 X 54091278 54091399 -1 snoRNA U3 54000000 54400000
19952 ENSG00000207104 X 54369979 54370085 1 snRNA RNU6-434P 54000000 54400000
20906 ENSG00000232765 X 55307804 55315218 1 processed_transcript RP11-382F24.2 55300000 55800000
20936 ENSG00000266328 X 55477928 55478015 1 miRNA hsa-mir-4536-2 55300000 55800000
20940 ENSG00000200635 X 55609166 55609267 1 misc_RNA Y_RNA 55300000 55800000
22224 ENSG00000235437 X 62563106 62780951 -1 processed_transcript RP11-357C3.3 62700000 63100000
23058 ENSG00000231729 X 62890076 62891382 -1 sense_intronic ARHGEF9-IT1 62700000 63100000
23078 ENSG00000222532 X 63005882 63005967 -1 miRNA MIR1468 62700000 63100000
25614 ENSG00000206646 X 67580917 67581029 1 misc_RNA Y_RNA 67400000 67800000
25639 ENSG00000206747 X 67759285 67759391 -1 snRNA RNU6-245P 67400000 67800000
28529 ENSG00000270641 X 73012040 73049066 1 lincRNA TSIX 73000000 73500000
28566 ENSG00000229807 X 73040486 73072588 -1 lincRNA XIST 73000000 73500000
29032 ENSG00000225470 X 73164159 73290243 1 lincRNA JPX 73000000 73500000
29160 ENSG00000228906 X 73168808 73169393 1 lincRNA RP13-216E22.4 73000000 73500000
29271 ENSG00000271199 X 73177894 73189320 -1 sense_overlapping RP13-216E22.5 73000000 73500000
29293 ENSG00000230590 X 73183790 73513409 -1 lincRNA FTX 73000000 73500000
29490 ENSG00000270223 X 73229224 73231303 -1 sense_intronic RP13-36G14.4 73000000 73500000
30052 ENSG00000271430 X 73420064 73461983 -1 sense_intronic RP3-368A4.5 73000000 73500000
30081 ENSG00000271533 X 73429811 73433495 -1 sense_intronic RP3-368A4.6 73000000 73500000
30087 ENSG00000202566 X 73438212 73438296 -1 miRNA MIR421 73000000 73500000
30091 ENSG00000212027 X 73438382 73438453 -1 miRNA MIR374B 73000000 73500000
30107 ENSG00000265727 X 73462445 73462736 -1 misc_RNA RN7SL648P 73000000 73500000
30738 ENSG00000200906 X 77092786 77092892 1 snRNA RNU6-854P 76700000 77400000
30739 ENSG00000263410 X 77140874 77141127 1 misc_RNA RN7SL460P 76700000 77400000
6368 ENSG00000252430 X 98640193 98640274 -1 miRNA AL109750.1 98500000 98900000
8122 ENSG00000212584 X 100946553 100946658 1 snRNA RNU6-587P 100900000 101400000
8267 ENSG00000252810 X 101170562 101170668 1 snRNA RNU6-345P 100900000 101400000
16072 ENSG00000206864 X 104920064 104920170 -1 snRNA RNU6-207P 104700000 105000000
16678 ENSG00000227610 X 106756213 106789051 -1 antisense FRMPD3-AS1 106700000 106900000
17399 ENSG00000251954 X 110664738 110664836 -1 snRNA RNU6-496P 110200000 111100000
17461 ENSG00000260802 X 110754890 110765627 1 lincRNA LINC00890 110200000 111100000
17562 ENSG00000266829 X 110859125 110859401 1 misc_RNA RN7SL661P 110200000 111100000
17706 ENSG00000201420 X 110913057 110913176 -1 rRNA RNA5SP512 110200000 111100000
17716 ENSG00000229487 X 110949877 110954329 -1 antisense ALG13-AS1 110200000 111100000
20214 ENSG00000199001 X 114058017 114058127 1 miRNA MIR448 114000000 114300000
20338 ENSG00000222122 X 114266990 114267096 1 snRNA RNU6-648P 114000000 114300000
10519 ENSG00000228659 X 130115564 130192120 -1 antisense RP1-23K20.2 129700000 130200000
12636 ENSG00000232160 X 131351175 131566890 1 antisense RAP2C-AS1 131200000 131600000
12958 ENSG00000206765 X 132665751 132665854 -1 snRNA RNU6-203P 132600000 132900000
13001 ENSG00000211495 X 132880775 132880873 -1 miRNA AL009174.1 132600000 132900000
11350 ENSG00000206948 X 153996803 153996934 1 snoRNA SNORA36A 153900000 154400000
11399 ENSG00000206693 X 154003273 154003401 1 snoRNA SNORA56 153900000 154400000
11469 ENSG00000221533 X 154115635 154115733 -1 miRNA MIR1184-1 153900000 154400000

Overlapping Chalmel genes

In [19]:
chalmel_genes = pandas.read_hdf(results_dir / 'chalmel_genes.hdf')
In [20]:
df = chalmel_genes.loc[lambda df: (df.Pattern.isin(['9', '10', '11', '12', '13'])) & (df.chrom == 'X'), 
                  ['name', 'chrom', 'start', 'end', 'Pattern', 'Expression in Testis']]

genes_overlapping_intervals(df, extended_peak_regions_subset)
Out[20]:
name chrom start end Pattern Expression in Testis peak_start peak_end
2638 MAP7D2 X 20024831 20135035 9 PET 19600000 20200000
3357 XK X 37545012 37591383 10 IE 37200000 37700000
26 AKAP4 X 49955406 49965664 13 SEHET 49500000 50000000
3018 PAGE4 X 49593863 49598576 10 IE 49500000 50000000
2879 CXorf67 X 51149767 51151687 9 SET 50800000 51300000
5740 XIST X 73040486 73072588 9 UE 73000000 73500000
379 IL13RA2 X 114238538 114254540 11 IE 114000000 114300000

Overlapping circRNA from Trine Line

In [22]:
trine_line_x_genes = pandas.read_hdf(results_dir / 'trine_line_x_genes.hdf')

genes_overlapping_intervals(trine_line_x_genes, extended_peak_regions_subset)
Out[22]:
chr start end ccName ovlpRepExon ovlpRepRange strand size sumReads max median mean sumExp Pattern Expression.in.Testis peak_start peak_end
7200 chrX 19983161 19988416 chrX_19983161-19988416 CXorf23 CXorf23 - 5255 3 3 0.0 0.250000 1 - - 19600000 20200000
7209 chrX 43515173 43515263 chrX_43515173-43515263 NaN NaN - 90 2 2 0.0 0.166667 1 NaN NaN 43100000 43600000
7212 chrX 49962215 49963355 chrX_49962215-49963355 AKAP4 AKAP4 - 1140 2 2 0.0 0.166667 1 13 SEHET 49500000 50000000
7216 chrX 54013510 54014377 chrX_54013510-54014377 PHF8 PHF8 - 867 2 2 0.0 0.166667 1 NaN NaN 54000000 54400000

GO enrichment analysis on Gorilla web server

In [21]:
with open(str(results_dir / 'candidate_gene_list.txt'), 'w') as f:
    for name in overlap_all.loc[lambda df: df['Gene type'] == 'protein_coding'].name:
        print(name, file=f)
        
with open(str(results_dir / 'bacground_gene_list.txt'), 'w') as f:
    for name in biomart_genes_x.loc[lambda df: df['Gene type'] == 'protein_coding'].name:
        print(name, file=f)

Overlap to ampliconic genes

In [22]:
ampliconic_regions = pandas.read_hdf(results_dir / 'ampliconic_regions.hdf')
In [23]:
GenomicIntervals.interval_jaccard(extended_peak_regions_subset.assign(chrom = 'X'), 
                                  ampliconic_regions.loc[lambda df: df.chrom == 'X'], 
                                  chromosome_sizes=chromosome_lengths, 
                                  samples=1000)
Out[23]:
(0.0061704338859588752, 0.86)

Non-random distance to ampliconic reginos not overlapping: Are ampliconic regions not overlapping low pi regions closer the low pi regions than expected?

In [24]:
ampl_reg_not_overlapping = GenomicIntervals.interval_diff(ampliconic_regions.loc[lambda df: df.chrom == 'X'],
                                                          extended_peak_regions_subset.assign(chrom = 'X'))
GenomicIntervals.distance_stat(ampl_reg_not_overlapping, extended_peak_regions_subset.assign(chrom = 'X'))
Out[24]:
(0.24316666666666689, 0.036)

Overlap to low ILS regions

In [25]:
human_chimp_low_ils_regions_chrX = pandas.read_hdf(results_dir / 'human_chimp_low_ils_regions_chrX.hdf')
human_orang_low_ils_regions_chrX = pandas.read_hdf(results_dir / 'human_orang_low_ils_regions_chrX.hdf')

Human-chimp ILS:

In [26]:
GenomicIntervals.interval_jaccard(extended_peak_regions_subset.assign(chrom = 'X'), 
                                  human_chimp_low_ils_regions_chrX, 
                                  chromosome_sizes=chromosome_lengths, 
                                  samples=100000)
p-value is zero smaller than 1e-05. Increase nr samples to get actual p-value.
Out[26]:
(0.17391304347826086, 0.0)

Human-orang ILS:

In [27]:
GenomicIntervals.interval_jaccard(extended_peak_regions_subset.assign(chrom = 'X'), 
                                  human_orang_low_ils_regions_chrX, 
                                  chromosome_sizes=chromosome_lengths, 
                                  samples=1000)
Out[27]:
(0.072555205047318619, 0.258)

Total swept with annotations

In [28]:
sweep_data = pandas.read_hdf(results_dir / 'sweep_data.hdf')
missing_regions = pandas.read_hdf(results_dir / 'missing_regions.hdf')

plot_df = (sweep_data
           .groupby(['start', 'end', 'region_1', 'region_label_1'])['swept']
           .aggregate(['sum', 'size'])
           .rename(columns={'sum': 'nr_swept', 'size': 'total'})
           .reset_index(level=['start', 'end', 'region_1', 'region_label_1'])
          )

plot_df.sort_values(by=['start', 'region_1'], inplace=True)

plot_df['cum_nr_swept'] = (plot_df
                           .loc[plot_df.region_label_1 != 'Africa']
                           .groupby(['start', 'end'])['nr_swept']
                           .transform('cumsum')    
                           )
plot_df['cum_total'] = (plot_df
                        .loc[plot_df.region_label_1 != 'Africa']
                        .groupby(['start', 'end'])['total']
                        .transform('sum')    
                        )
plot_df.head(7)
Out[28]:
start end region_1 region_label_1 nr_swept total cum_nr_swept cum_total
0 0 100000 Africa Africa 0.0 22 NaN NaN
1 0 100000 WestEurasia WestEurasia 0.0 48 0.0 140.0
2 0 100000 SouthAsia SouthAsia 0.0 31 0.0 140.0
3 0 100000 CentralAsiaSiberia CentralAsiaSiberia 0.0 10 0.0 140.0
4 0 100000 Oceania Oceania 0.0 16 0.0 140.0
5 0 100000 EastAsia EastAsia 0.0 27 0.0 140.0
6 0 100000 America America 0.0 8 0.0 140.0
In [29]:
class ClickInfo(mpld3.plugins.PluginBase):
    """mpld3 Plugin for getting info on click        """

    JAVASCRIPT = """
    mpld3.register_plugin("clickinfo", ClickInfo);
    ClickInfo.prototype = Object.create(mpld3.Plugin.prototype);
    ClickInfo.prototype.constructor = ClickInfo;
    ClickInfo.prototype.requiredProps = ["id", "urls"];
    function ClickInfo(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    ClickInfo.prototype.draw = function(){
        var obj = mpld3.get_element(this.props.id);
        urls = this.props.urls;
        obj.elements().on("mousedown",
                          function(d, i){ 
                            window.open(urls[i], '_blank')});
    }
    """
    def __init__(self, points, urls):
        self.points = points
        self.urls = urls
        if isinstance(points, matplotlib.lines.Line2D):
            suffix = "pts"
        else:            
            suffix = None
        self.dict_ = {"type": "clickinfo",
                      "id": mpld3.utils.get_id(points, suffix),
                      "urls": urls}

#ucsc_search = "https://genome-euro.ucsc.edu/cgi-bin/hgTracks?hgsid=226837763_FKVw0jsAcbutCxMf8luSHlzwx2xW&org=Human&db=hg37&position={}&pix=1361"
ucsc_search = "https://genome-euro.ucsc.edu/cgi-bin/hgTracks?db=hg19&pix=2000&position={}&hgsid=230604418_7ASTqKyqNTW8yadJActGtpWwO1V2"
    
# fig, ax = plt.subplots()
# points = ax.scatter(numpy.random.rand(50), numpy.random.rand(50),
#                     s=500, alpha=0.3)
# urls = [ucsc_search.format('CCR5') for i in range(50)]

# # mpld3.plugins.connect(fig, ClickInfo(points, urls))
# # mpld3.display(fig)
In [30]:
with sns.axes_style("whitegrid", {'axes.grid' : False}):
#     fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=True, figsize=(13, 9),                         
#                                    subplot_kw={'xlim':(0, chromosome_lengths['X']), 'ylim':(0, 1)})

    fig, ax1 = plt.subplots(1, 1, sharex=True, sharey=True, figsize=(13, 9),                         
                                   subplot_kw={'xlim':(0, chromosome_lengths['X']), 'ylim':(0, 1.1)})


    zorder = 1

    zorder += 1
    genes = biomart_genes_x
    pos_list = list()
    labels = list()
    for tup in genes.itertuples():
        x = tup.start + (tup.end - tup.start) / 2
        pos_list.append(x)
        labels.append("{}".format(tup.name))
        ax1.add_line(Line2D([x, x], [0, 1.1], color='lightgrey', zorder=zorder))
    zorder += 1        
    scatter = ax1.scatter(pos_list, [1.05 for x in pos_list], c='lightgrey', s=50, zorder=zorder)
    tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
    mpld3.plugins.connect(fig, tooltip)

    urls = [ucsc_search.format(tup.name) for tup in genes.itertuples()]
    mpld3.plugins.connect(fig, ClickInfo(scatter, urls))


#     zorder += 1        
#     genes = chalmel_genes_subset
#     pos_list = list()
#     labels = list()
#     for tup in genes.itertuples():
#         x = tup.start + (tup.end - tup.start) / 2
#         pos_list.append(x)
#         labels.append("{} {}".format(tup.name, tup.Pattern))
#         ax1.add_line(Line2D([x, x], [0, 1.1], color='red', zorder=zorder))
#     zorder += 1        
#     scatter = ax1.scatter(pos_list, [1.05 for x in pos_list], c='red', s=50, zorder=zorder)
#     tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
#     mpld3.plugins.connect(fig, tooltip)
    
#     urls = [ucsc_search.format(tup.name) for tup in genes.itertuples()]
#     mpld3.plugins.connect(fig, ClickInfo(scatter, urls))

    
#     zorder += 1        
#     genes = trine_line_x_genes
#     pos_list = list()
#     labels = list()
#     for tup in genes.itertuples():
#         x = tup.start + (tup.end - tup.start) / 2
#         pos_list.append(x)
#         labels.append("{}".format(tup.ovlpRepExon))
#         ax1.add_line(Line2D([x, x], [0, 1.1], color='pink', zorder=zorder))
#     zorder += 1        
#     scatter = ax1.scatter(pos_list, [1.05 for x in pos_list], c='pink', s=50, zorder=zorder)
#     tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
#     mpld3.plugins.connect(fig, tooltip)
    
#     urls = [ucsc_search.format(tup.ovlpRepExon) for tup in genes.itertuples()]
#     mpld3.plugins.connect(fig, ClickInfo(scatter, urls))

    
    regs = [x for x in plot_df.region_1.cat.categories if x != 'Africa'][::-1]
    for reg in regs:
        df = plot_df.loc[plot_df.region_label_1 == reg]
        zorder += 1        
        for tup in df.itertuples():
            if tup.nr_swept:
                g = ax1.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, tup.cum_nr_swept/tup.cum_total, 
                                  facecolor=region_colors[reg], 
                                  linewidth=0,
                                  edgecolor=None,#region_colors[reg], 
                                  zorder=zorder))

#     df = plot_df.loc[plot_df.region_label_1 == 'Africa']
#     for tup in df.itertuples():
#         if tup.nr_swept:
#             g = ax2.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, tup.nr_swept/tup.total, 
#                               facecolor=region_colors['Africa'], 
#                               edgecolor=None,#region_colors[reg], 
#                               ))

    zorder += 1        
    for tup in missing_regions.loc[missing_regions.is_missing == True].itertuples():
        g = ax1.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, 1, 
                 facecolor='lightgray', 
                 #edgecolor=None,
                  linewidth=0,
                 alpha=0.5,
                 zorder=zorder))
#         g = ax2.add_patch(Rectangle((tup.start, 0), tup.end-tup.start, 1, 
#                  facecolor='lightgray', 
#                  edgecolor=None,
#                  alpha=0.5,
#                  zorder=zorder))

  
    plt.savefig(str(figures_dir / "tmp2.pdf"))
    #plt.close() # closing teh plot suppres automatic plotting without plt.show()

mpld3.display(fig)
Out[30]:
In [ ]: